load libs

library(splines)
library(stringr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(DBI)

Load exploration

nhanes_db <- dbConnect(RSQLite::SQLite(), "nhanes.sqlite")

# list all of the tables
dbListTables(nhanes_db)
## [1] "blood_cholesterol"     "body_measures"         "current_health_status"
## [4] "demo"                  "diabetes"              "diet_total"           
## [7] "medical_conditions"    "merged_table"          "var_decr"
cols <- 'BMXWAIST , RIDAGEYR, BMXHT, BMXBMI, BMXWT, RIAGENDR, years, DR1TM161, WTDRD1, BMXLEG, BMXARML '
data_sql <- paste0('SELECT ', cols, 'FROM merged_table')

dbListTables(nhanes_db)
## [1] "blood_cholesterol"     "body_measures"         "current_health_status"
## [4] "demo"                  "diabetes"              "diet_total"           
## [7] "medical_conditions"    "merged_table"          "var_decr"
data <- dbGetQuery(nhanes_db, data_sql)
data <- na.omit(data)

dbDisconnect(nhanes_db)


train_ix <- sample(x = 1:nrow(data), size = 6000)
test_ix <- sample(x = setdiff(1:nrow(data), train_ix), 3000)

train_data <- data[train_ix, ]
test_data <- data[test_ix, ]

Inverse Normal Distribution

invNorm <- function(x) {qnorm((rank(x) - 3/8)/(length(x) +1 - 6/8))}

mean_square_error <- function(y_true, y_pred){
    round(mean((y_true - y_pred)^2),4)
}

plot_density <- function(data,data_name,col='red'){
    d <- density(data)
    plot(d,main=paste(data_name,"Density"))
    polygon(d, col=col, border="blue")
  }

Load exploration

hist(data$RIDAGEYR)

ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) + 
  geom_point(alpha=0.1)+
  geom_smooth(method = lm, formula = y ~ x)

ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) + 
  geom_point(alpha=0.1)+
  geom_smooth(method = lm, formula = y ~ bs(x, df=7), size = 1)

ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST,colour=RIAGENDR)) + 
  geom_point(alpha=0.1)+
  geom_smooth(method = lm, formula = y ~ bs(x, df=7), size = 1)+
  geom_smooth(method = lm, formula = y ~ x)

ggplot(data, aes(x=RIDAGEYR, y=BMXWAIST) ) + 
  geom_point(aes(colour=RIAGENDR),alpha=0.2)+
  geom_density_2d()

ggplot(data, aes(x=BMXWT, y=BMXWAIST,colour=RIAGENDR) ) + 
  geom_point(alpha=0.1)+geom_smooth(method = lm,formula = y ~ x)

ggplot(data, aes(x=BMXWT, y=BMXWAIST) ) + 
  geom_point(aes(colour=RIAGENDR),alpha=0.1)+
  geom_density_2d()

ggplot(data, aes(x=BMXBMI, y=BMXWAIST) ) + 
  geom_point(aes(colour=RIAGENDR),alpha=0.1)+
  geom_density_2d()

Regression models

lm_reg = lm(formula = BMXWAIST ~ BMXWT, train_data)
print(summary(lm_reg))
## 
## Call:
## lm(formula = BMXWAIST ~ BMXWT, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.4764  -5.0999  -0.1621   4.9584  29.3024 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 43.704013   0.382114   114.4   <2e-16 ***
## BMXWT        0.684642   0.004572   149.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.257 on 5998 degrees of freedom
## Multiple R-squared:  0.789,  Adjusted R-squared:  0.789 
## F-statistic: 2.243e+04 on 1 and 5998 DF,  p-value: < 2.2e-16
train_res_df <- data.frame(
  residuals <- lm_reg$residuals,
  age <- train_data$RIDAGEYR,
  sex <- train_data$RIAGENDR
  )




# tags <- c("[21-30)","[30-40)", "[40-50)", "[50-60)", "[60-70)", "[70-80)","[80,120)")
# train_res_df <- train_res_df |>
#   mutate(tag = case_when(
#     age < 30 ~ tags[1],
#     age >= 30 & age < 40 ~ tags[2],
#     age >= 40 & age < 50 ~ tags[3],
#     age >= 50 & age < 60 ~ tags[4],
#     age >= 60 & age < 70 ~ tags[5],
#     age >= 70 & age < 80 ~ tags[6],
#     age >= 80 ~ tags[7],
# 
#     )) |> arrange(age)


#create bins for age
tags <- c("[21-25)","[25-30)", "[30-35)", "[35-40)","[40-45)", "[45-50)", "[50-55)",
          "[55,60)","[60-65)", "[65-70)", "[70-75)", "[75-80)", "[80-85)","[85,120)")
train_res_df <- train_res_df |>
  mutate(tag = case_when(
    age < 25 ~ tags[1],
    age >= 25 & age < 30 ~ tags[2],
    age >= 30 & age < 35 ~ tags[3],
    age >= 35 & age < 40 ~ tags[4],
    age >= 40 & age < 45 ~ tags[5],
    age >= 45 & age < 50 ~ tags[6],
    age >= 50 & age < 55 ~ tags[7],
    age >= 55 & age < 60 ~ tags[8],
    age >= 60 & age < 65 ~ tags[9],
    age >= 65 & age < 70 ~ tags[10],
    age >= 70 & age < 75 ~ tags[11],
    age >= 75 & age < 80 ~ tags[12],
    age >= 80 & age < 85 ~ tags[13],
    age >= 85 ~ tags[14],

    )) |> arrange(age)
train_res_df$tag <- as.factor(train_res_df$tag)



ggplot(data = train_res_df, mapping = aes(x=tag,y=residuals,color=sex)) +
  geom_jitter(alpha=0.2)+
  geom_boxplot(fill="bisque",color="black",alpha=0.3)+
  stat_summary(fun = mean, color="blue", shape=20,size=1)+
  theme(axis.text.x = element_text(angle = 45))
## Warning: Removed 14 rows containing missing values (geom_segment).

test_df <- test_data |> select(-BMXWAIST)
lm_pred = predict(lm_reg, newdata = test_df, se = T)

# save prediction results
pred_df = data.frame(
  fit = lm_pred$fit,
  lwr = lm_pred$fit - 2 * lm_pred$se.fit,
  upr = lm_pred$fit + 2 * lm_pred$se.fit,
  weight = test_data$BMXWT,
  sex = test_data$RIAGENDR,
  age <- train_data$RIDAGEYR,
  label = test_data$BMXWAIST
)
## Warning in data.frame(fit = lm_pred$fit, lwr = lm_pred$fit - 2 *
## lm_pred$se.fit, : row names were found from a short variable and have been
## discarded
ggplot(pred_df, aes(x = weight, y = label,color=sex) ) +
     geom_point(alpha=0.2) +
     geom_ribbon( aes(ymin = lwr, ymax = upr, fill = sex), alpha = .15) +
     geom_line(aes(y = fit), size = 1)

Regression models with age

lm_reg = lm(formula = BMXWAIST ~ BMXWT + RIDAGEYR, train_data)

lm_pred = predict(lm_reg, newdata = test_df, se = T)

lm_reg2 = lm(formula = BMXWAIST ~ BMXWT + bs(RIDAGEYR,df=7), train_data)
lm_pred2 = predict(lm_reg2, newdata = test_df, se = T)



# save prediction results
pred_df = data.frame(
  fit1 = lm_pred$fit,
  fit2 = lm_pred2$fit,
  weight = test_data$BMXWT,
  sex = test_data$RIAGENDR,
  age <- train_data$RIDAGEYR,
  label = test_data$BMXWAIST
)
## Warning in data.frame(fit1 = lm_pred$fit, fit2 = lm_pred2$fit, weight =
## test_data$BMXWT, : row names were found from a short variable and have been
## discarded
ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
  geom_point(aes(y=fit1,x= weight,color=sex),alpha = 0.3)

ggplot(pred_df, aes(x = weight, y = label)) + geom_point(colour = "black",alpha = 0.1) +
  geom_point(data=pred_df,aes(y=fit1,x= weight,colour="lm"),alpha = 0.1)+
  geom_point(data=pred_df,aes(y=fit2,x= weight,colour="lm_bs"),alpha = 0.1) +
  scale_color_manual(name = "models", values = c("lm" = "darkblue", "lm_bs" = "red"))

qplot(x=fit1,y=fit2,data=pred_df,colour=sex,alpha=I(0.1))